import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
from collections import Counter
from sklearn.metrics import accuracy_score,mean_squared_error,roc_curve,roc_auc_score,classification_report,r2_score,confusion_matrix
#train test split, Grid Search CV
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit,GridSearchCV
from sklearn.model_selection import train_test_split
#for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder
#for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
# Load the customer churn dataset and take a first look at its shape and content.
churn = pd.read_csv("Churn_Modelling.csv")
churn.head()
churn.tail()
# NOTE(review): self-copy is a no-op here; churn is already a fresh frame from read_csv.
churn = churn.copy()
churn.info()
churn.dtypes
# Check for missing values per column.
churn.isna().sum()
# Removing the first three columns from the data set because of their irrelevance
# churn = churn.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
# churn.head(5)
churn['Exited'].value_counts(normalize=True) # counting the number of customers that churn and those that didn't churn
# Knowing the % of those that have exited and those that have not exited
print(sum(churn["Exited"]) / len(churn["Exited"]))
print(1 - sum(churn["Exited"]) / len(churn["Exited"]))
It shows that 20.37% of customers have exited.
# Summary statistics for the numeric features (ID-like columns excluded).
churn.drop(["RowNumber", "CustomerId"], axis=1).describe().T
# Checking for correlation matrix
churn.corr()
plt.style.use("ggplot")
f,ax=plt.subplots(figsize = (15, 10))
# Heatmap of pairwise correlations between the numeric features.
sns.heatmap(churn.drop(["RowNumber", "CustomerId"], axis=1).corr(),
robust=True, fmt=' .1g',
linewidths=1.3,
linecolor='green',
annot=True,);
# Visualizing the count of 'exited customers' in the dataset
plt.figure(figsize=(7,8))
sns.countplot(x='Exited', data=churn)
plt.xlabel('0: Customers still with the bank, 1: Customers exited the bank')
plt.ylabel('Count')
plt.title('Customers Churn Visualization')
plt.show()
# Credit-card ownership broken down by country.
plt.figure(figsize=(10,8))
sns.countplot(x="HasCrCard",
hue = "Geography",
data = churn, palette="husl");
print(churn.groupby('Geography')['HasCrCard'].sum())
# Credit-card ownership broken down by churn status.
plt.figure(figsize=(10,8))
sns.countplot(x="HasCrCard",
hue = "Exited",
data = churn);
print(churn.groupby('Geography')['HasCrCard'].sum())
# Salary distribution per country, split by churn status (interactive box plot).
fig = px.box(churn, x="Geography", y = "EstimatedSalary",color = 'Exited'); # Another visualization about salary effect
fig.update_layout(title_text="The country with the mean salary-With Outliers(Exited-Not Exited groups)")
fig.show();
# Credit-card usage count for every individual age value.
plt.figure(figsize = (20,8))
plt.xticks(rotation=90)
plt.title('Credit Card Usage for Ages',color = 'blue',fontsize=15)
sns.countplot(x=churn["Age"],hue = 'HasCrCard',data=churn);
plt.xlabel('Ages')
plt.ylabel('Number of Credit Card Users');
# Age distribution by card ownership, split by churn status.
fig = px.box(churn, x="HasCrCard", y = "Age", color= "Exited");
# Fix: the original title string was missing its closing parenthesis.
fig.update_layout(title_text = "Credit Card Usage & Age - With Outliers(Exited-Not Exited groups)")
fig.show();
# Parallel-categories diagram: customer flow across Gender -> Geography -> Exited.
fig = px.parallel_categories(churn, dimensions=['Gender', 'Geography', 'Exited'],
color="Exited",
color_continuous_scale=px.colors.sequential.Inferno,
labels={'Gender':'Gender(Female,Male)', 'Exited':'Exited(0:No,1:Yes)'})
fig.update_layout(title_text="Gender-Geography-Exited-Not Exited Schema")
fig.show();
# Same idea, with card ownership and activity status added as dimensions.
fig = px.parallel_categories(churn, dimensions=['Gender','HasCrCard',"IsActiveMember", 'Exited'],
color="Exited", color_continuous_scale=px.colors.sequential.Inferno,
labels={'HasCrCard':'Has Credit Card', 'Gender':'Gender(Female,Male)', 'Exited':'Exited(0:No,1:Yes)'})
fig.update_layout(title_text="Credit Card-Gender-Exited-Not Exited Schema")
fig.show();
# Mean credit score per country, then its distribution split by churn status.
print(churn.groupby("Geography")["CreditScore"].mean())
fig = px.box(churn, x="Geography", y = "CreditScore",color = 'Exited');
fig.update_layout(title_text="The country with the highest credit score(mean)-With Outliers(Exited-Not Exited groups)")
fig.show();
# Credit score per country, split by churn status and faceted by gender.
# Fix: the original passed data=df, but no DataFrame named `df` exists in this
# script -- the dataset is `churn`, so the call raised a NameError.
# NOTE: sns.catplot creates its own figure; the plt.figure call only yields an
# extra empty figure and is kept for fidelity with the original notebook cell.
plt.figure(figsize = (14,8));
sns.catplot(x='Geography',
y = "CreditScore",
hue="Exited",
col="Gender",
aspect=1.2,height=5,
kind="swarm", data=churn);
# Age vs. credit score, coloured by gender.
plt.figure(figsize = (16,6))
plt.xticks(rotation=45)
sns.scatterplot(x=churn['Age'],y = churn["CreditScore"],hue = "Gender", data=churn);
# Age vs. credit score, coloured by churn status.
plt.figure(figsize = (16,6))
plt.xticks(rotation=75)
sns.scatterplot(x=churn['Age'],y = churn["CreditScore"],hue = "Exited", data=churn);
# Churn counts for every individual age value.
plt.figure(figsize = (20,8))
plt.xticks(rotation=45)
sns.countplot(x=churn["Age"],hue = 'Exited',data=churn, palette="husl");
plt.xlabel('Age')
plt.ylabel('Number of customers (Exited or not)');
# Slice the customers into decade-wide age bands.
below_30 = churn[churn["Age"] < 30]
between_30_40 = churn[churn["Age"].between(30, 40, inclusive="left")]
between_40_50 = churn[churn["Age"].between(40, 50, inclusive="left")]
between_50_60 = churn[churn["Age"].between(50, 60, inclusive="left")]
between_60_70 = churn[churn["Age"].between(60, 70, inclusive="left")]
above_70 = churn[churn["Age"] >= 70]
# Churner count per band, kept as the same individual names as the original cell.
k, l, m, n, o, p = (
    band["Exited"].sum()
    for band in (below_30, between_30_40, between_40_50,
                 between_50_60, between_60_70, above_70)
)
# One countplot per age band: churn counts for each individual age, stacked
# as six rows of a single tall figure.
f, ax = plt.subplots(figsize=(15, 15))
bands = [below_30, between_30_40, between_40_50,
         between_50_60, between_60_70, above_70]
for row, band in enumerate(bands, start=1):
    plt.subplot(6, 1, row)
    sns.countplot(x=band["Age"], hue='Exited', data=churn, palette="husl")
    plt.xlabel('Age')
    plt.ylabel('Customers (Exited)')
    plt.xticks(rotation=30)
# Churn rate (mean of the 0/1 Exited flag) per country, split by gender.
plt.figure(figsize = (10,6))
plt.xticks(rotation=45)
sns.barplot(x=churn['Geography'],y = churn["Exited"],hue = "Gender",data=churn, palette="husl");
plt.ylabel('Percetage of people (Exited %)');
# Churn counts per country.
plt.figure(figsize = (9,6))
plt.xticks(rotation=45)
sns.countplot(x=churn["Geography"],hue = 'Exited',data=churn, palette="husl");
plt.xlabel('Geo')
plt.ylabel('Number of customers (Exited or not)');
# Headcounts: overall per country, churned per country, and churned by gender
# within each country.
print("Total Number of People By Geography\n",churn["Geography"].value_counts())
print("Number of People Exited By Geography\n",churn[churn['Exited']==1]["Geography"].value_counts(),'\n')
print("Number of People Exited By Gender in Germany \n",churn[(churn['Exited']==1)&(churn['Geography']=='Germany')]["Gender"].value_counts())
print("Number of People Exited By Gender in France \n",churn[(churn['Exited']==1)&(churn['Geography']=='France')]["Gender"].value_counts())
print("Number of People Exited By Gender in Spain \n",churn[(churn['Exited']==1)&(churn['Geography']=='Spain')]["Gender"].value_counts())
# Age vs. estimated salary, coloured by churn status.
plt.figure(figsize = (10,6))
plt.xticks(rotation=75)
sns.scatterplot(x='Age',y = "EstimatedSalary",hue = "Exited",data=churn);
The plot above shows that salary has no effect on the exit decision.
# Age vs. account balance, coloured by churn status.
plt.figure(figsize = (10,6))
plt.xticks(rotation=75)
sns.scatterplot(x='Age',y = "Balance",hue = "Exited",data=churn);
# NOTE(review): self-copy is a no-op; kept from the original notebook cell.
churn = churn.copy()
At this stage, I will create new age groups that will help improve the prediction score.
# Bucket ages into named groups for feature engineering.
# Fix/idiom: the original per-row loop indexed churn['Age'][i] positionally,
# which breaks whenever the frame's index is not 0..n-1; pd.cut is index-safe,
# vectorized, and produces exactly the same labels:
#   [<30) Young, [30,40) Young-Adults, [40,50) Adults,
#   [50,60) Elderly-Adults, [60,74) Old, [74,+inf) Very-Old
age_group_data = pd.cut(
    churn['Age'],
    bins=[-np.inf, 30, 40, 50, 60, 74, np.inf],
    labels=['Young', 'Young-Adults', 'Adults', 'Elderly-Adults', 'Old', 'Very-Old'],
    right=False,            # left-closed bins: 30 -> Young-Adults, 74 -> Very-Old
).astype(str).tolist()      # keep plain strings and a list, like the original
churn['age_group'] = age_group_data
I will create a boundary line for the credit score.
# Binary credit-score flag: 0 for scores below 405, 1 otherwise.
# Fix/idiom: a vectorized comparison replaces the positional loop over
# churn['CreditScore'][i], which breaks on a non-default index.
Credit = (churn['CreditScore'] >= 405).astype(int).tolist()
churn['new_credit'] = Credit
# Class balance of the new flag.
churn["new_credit"].value_counts()
# Mean churn rate per credit flag and per age group (bar plots).
# Fix: sns.factorplot was renamed to sns.catplot in seaborn 0.9 and has been
# removed from current releases; catplot takes the same arguments here.
sns.catplot(x = "new_credit", y = "Exited", data = churn ,kind = "bar")
plt.xticks(rotation=75)
plt.ylabel("Exited(Precent)");
g = sns.catplot(x = "age_group", y = "Exited", data = churn, kind = "bar")
plt.xticks(rotation=45)
g.set_ylabels("Exited")
plt.show()
1 Building dependent and independent variables 2 Determining the train and test sets
1 Trying all models of the learning algorithms 2 Setting the parameters 3 Determining the best parameters 4 Doing cross-validation 5 Finding the accuracy score 6 The most effective variables will be determined in each model
1 Visualization of all models' accuracy scores 2 The model that gives the best results will be determined 3 Research will be done on the mathematical algorithm that creates the best model.
# Encode the categorical features and assemble the modelling matrix.
gender_dummies = churn.replace(to_replace={'Gender': {'Female': 0,'Male':1}})  # Gender -> 0/1
a = pd.get_dummies(churn['Geography'], prefix = "Geo_dummy")   # one-hot country
c = pd.get_dummies(churn['age_group'], prefix = "Age_dummy")   # one-hot age band
frames = [gender_dummies,a,c]
churn = pd.concat(frames, axis = 1)
# Drop identifiers, the raw columns that were just encoded, and the raw
# CreditScore (replaced by the new_credit flag).
# Fix: "Geography" was listed twice in the original drop list.
churn = churn.drop(["RowNumber","Geography","Surname","CustomerId",'Age','age_group',"CreditScore"],axis = 1)
churn.head()
x = churn.drop(["Exited"],axis = 1) #Independent value
y = churn["Exited"] #Depended value
# Scale every feature into [0, 1] before splitting, so distance-based models
# are not dominated by large-valued columns.
from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()
# fit on the full feature matrix and transform it in a single step
x_norm = norm.fit_transform(x)
# 70/30 split with a fixed seed so every model sees the same partition
x_train, x_test, y_train, y_test = train_test_split(
    x_norm, y, test_size=0.3, random_state=42
)
# Baseline logistic regression with default parameters.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
log_reg = LogisticRegression().fit(x_train,y_train)
y_pred = log_reg.predict(x_test)
# Accuracy (%) at the default 0.5 decision threshold.
log_model = (accuracy_score(y_test,y_pred)*100)
log_model
# Re-score with a custom 0.53 probability threshold on the positive class.
y_probs = log_reg.predict_proba(x_test)[:,1]
y_pred = [1 if i >0.53 else 0 for i in y_probs]
log_proba_score = (accuracy_score(y_test,y_pred)*100)
print ("log score=",log_proba_score)
confusion_matrix(y_test,y_pred)
# Hyper-parameter search for logistic regression.
# NOTE(review): penalty "l1" is incompatible with the default lbfgs solver, so
# those grid cells fail; restricting the solver (as the trailing comment
# hints) would avoid the wasted fits.
log_params = {"C":np.logspace(-3,3,7),
"penalty": ["l1","l2"],
"max_iter":[10,50,500,1000]} #"solver":['lbfgs', 'liblinear', 'sag', 'saga'],
log =LogisticRegression()
log_cv = GridSearchCV(log,log_params,cv = 10)
log_tuned = log_cv.fit(x_train,y_train)
log_tuned.best_params_
# Refit with the chosen parameters and score at the 0.53 threshold.
log_reg_tuned = LogisticRegression(C=100,max_iter=50,penalty='l2',solver='liblinear').fit(x_train,y_train)
# Fix: the original took probabilities from the UNtuned log_reg here, so the
# "tuned" score never reflected log_reg_tuned at all.
y_probs = log_reg_tuned.predict_proba(x_test)[:,1]
y_pred = [1 if i >0.53 else 0 for i in y_probs]
log_tuned_score = (accuracy_score(y_test,y_pred)*100)
print ("log tuned score=",log_tuned_score)
lr_cm = confusion_matrix(y_test,y_pred)
lr_cm
Gaussian NB
# Gaussian Naive Bayes baseline.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
gnb_model = nb.fit(x_train,y_train)
gnb_model
y_pred = gnb_model.predict(x_test)
nb_score = (accuracy_score(y_test,y_pred)*100)
nb_score
# Tune the only GNB hyper-parameter (variance smoothing) over a log grid.
nb_params = {'var_smoothing': np.logspace(0,-9, num=100)}
nb =GaussianNB()
nb_cv = GridSearchCV(nb,nb_params,cv = 10)
nb_cv = nb_cv.fit(x_train,y_train)
nb_cv.best_params_
# Refit with the smoothing value hard-coded from a previous search run.
nb_tuned =GaussianNB(var_smoothing=0.43287612810830584).fit(x_train,y_train)
y_pred = nb_tuned.predict(x_test)
# NOTE(review): nb_tuned is rebound from the fitted model to its accuracy (%)
# here; the comparison chart at the end relies on this number.
nb_tuned = (accuracy_score(y_test,y_pred)*100)
nb_tuned
nb_cm = confusion_matrix(y_test,y_pred)
nb_cm
# K-nearest-neighbours baseline.
from sklearn.neighbors import KNeighborsClassifier
knn =KNeighborsClassifier()
knn_model = knn.fit(x_train,y_train)
knn_model
y_pred = knn_model.predict(x_test)
knn_score = (accuracy_score(y_test,y_pred)*100)
knn_score
# Grid-search k, the weighting scheme, and the distance metric.
knn_params = {"n_neighbors":np.arange(1,50),
"weights": ["uniform","distance"],
"metric":["euclidean","manhattan"]}
knn =KNeighborsClassifier()
knn_cv = GridSearchCV(knn,knn_params,cv = 10)
knn_cv = knn_cv.fit(x_train,y_train)
print("Best Parameters:"+str(knn_cv.best_params_))
# Refit with parameters hard-coded from a previous search run.
knn_final =KNeighborsClassifier(n_neighbors =15,metric='manhattan',weights='distance')
knn_final = knn_final.fit(x_train,y_train)
y_pred = knn_final.predict(x_test)
# knn_tuned holds the accuracy (%) -- used later in the comparison chart.
knn_tuned = (accuracy_score(y_test,y_pred)*100)
knn_tuned
knn_cm = confusion_matrix(y_test,y_pred)
knn_cm
# Compare the three standard SVM kernels on the normalized features.
from sklearn.svm import SVC
svm_model_linear = SVC(kernel='linear').fit(x_train,y_train)
svm_model_poly = SVC(kernel='poly').fit(x_train,y_train)
svm_model_rbf = SVC(kernel='rbf').fit(x_train,y_train)
y_pred_linear = svm_model_linear.predict(x_test)
y_pred_poly = svm_model_poly.predict(x_test)
y_pred_rbf = svm_model_rbf.predict(x_test)
# Accuracy (%) for each kernel.
print(accuracy_score(y_test,y_pred_linear)*100)
print(accuracy_score(y_test,y_pred_poly)*100)
print(accuracy_score(y_test,y_pred_rbf)*100)
Polynomial kernel (poly)
# Grid-search C / kernel / gamma for the SVM.
svc_params = {"C": [1,5,10,50,100,200],
'kernel':['poly','rbf'],
"gamma": [0.001, 0.01, 0.1,0.5],}
svc = SVC()
svc_cv_model = GridSearchCV(svc,svc_params,
cv = 5,
n_jobs = -1,
verbose = 2)
svc_cv_model.fit(x_train,y_train)
print("Best Parameters:"+str(svc_cv_model.best_params_))
# Polynomial kernel with parameters hard-coded from a previous search run.
svc_tuned = SVC(kernel = "poly",C=50,gamma=0.5).fit(x_train,y_train)
y_pred = svc_tuned.predict(x_test)
svc_tuned_score = (accuracy_score(y_test,y_pred)*100)
svc_tuned_score
confusion_matrix(y_test,y_pred)
##### Radial basis function kernel(rbf)
svc_rbf_tuned = SVC(kernel = "rbf",C=50,gamma=0.1).fit(x_train,y_train)
y_pred = svc_rbf_tuned.predict(x_test)
svc_rbf_score = (accuracy_score(y_test,y_pred)*100)
svc_rbf_score
svm_cm = confusion_matrix(y_test,y_pred)
svm_cm
# Random forest baseline with default parameters.
from sklearn.ensemble import RandomForestClassifier
r_for = RandomForestClassifier().fit(x_train,y_train)
r_for
y_pred = r_for.predict(x_test)
rf_score = accuracy_score(y_test,y_pred)*100
rf_score
# Grid-search depth, feature sampling, ensemble size, split criterion, leaf size.
rf_params = {'max_depth':list(range(1,10)),
"max_features":["log2","auto","sqrt"],
"n_estimators":[2,10,20,50,150,300],
'criterion' : ['gini','entropy'],
'min_samples_leaf' : [1,3,5,10]}
rf_model = RandomForestClassifier()
rf_cv_model = GridSearchCV(rf_model,
rf_params,
cv = 5,
n_jobs = -1)
rf_cv_model.fit(x_train,y_train)
rf_cv_model.best_params_
# Refit with hand-picked parameters.
# NOTE(review): max_depth=10 is outside the searched range(1,10) (i.e. 1..9) --
# presumably intentional; confirm against the grid results.
rf_tuned = RandomForestClassifier(max_depth = 10,
criterion = 'gini',
max_features = 'log2',
min_samples_leaf = 1,
n_estimators = 150,random_state=42)
rf_tuned = rf_tuned.fit(x_train,y_train)
y_pred = rf_tuned.predict(x_test)
rf_tuned_score = (accuracy_score(y_test,y_pred)*100)
rf_tuned_score
rf_cm = confusion_matrix(y_test,y_pred)
rf_cm
# Gradient boosting baseline, then a grid search over the main knobs.
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier()
gbm_model = gbm.fit(x_train,y_train)
gbm_model
y_pred = gbm_model.predict(x_test)
gbm_score = accuracy_score(y_test,y_pred)*100
gbm_score
# Fix: min_samples_split=1 is rejected by scikit-learn (it must be an int >= 2
# or a float fraction), so those grid cells only produced failed fits; the
# invalid value is removed from the search space.
gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.2],
"n_estimators": [100,200,300,500,1000],
"max_depth": [1,3,5,10],
"min_samples_split": [2,5,10]}
gbm = GradientBoostingClassifier()
clf = GridSearchCV(gbm,gbm_params,verbose=0,n_jobs=-1,cv=3)
gb = clf.fit(x_train,y_train)
gb.best_params_
# Refit with the chosen parameters (hard-coded from a previous run).
gbm = GradientBoostingClassifier(n_estimators=100,min_samples_split=5,max_depth=3,learning_rate=0.2,random_state=42)
gbm.fit(x_train,y_train)
y_pred = gbm.predict(x_test)
gbm_tuned_score = accuracy_score(y_test,y_pred)*100
gbm_tuned_score
gbm_cm = confusion_matrix(y_test,y_pred)
gbm_cm
# XGBoost baseline, then a grid search.
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100)
xgb_model = xgb.fit(x_train,y_train)
xgb_model
y_pred = xgb_model.predict(x_test)
xgb_score = accuracy_score(y_test,y_pred)*100
xgb_score
# Fix: "min_samples_split" is a scikit-learn tree parameter, not an XGBoost
# one -- XGBClassifier does not use it, so searching over it quadrupled the
# grid for nothing. It is removed here and from the refit below.
xgb_params ={
'n_estimators': [50, 100, 200],
'subsample': [ 0.6, 0.8, 1.0],
'max_depth': [1,2,3,4],
'learning_rate': [0.1,0.2, 0.3, 0.4, 0.5]}
xgb = XGBClassifier()
xgb = GridSearchCV(xgb,xgb_params,verbose=0,n_jobs=-1,cv=3)
xgb = xgb.fit(x_train,y_train)
xgb.best_params_
# Refit with the selected parameters (hard-coded from a previous run).
xgbm_cv = XGBClassifier(learning_rate=0.3,
max_depth=2,
n_estimators=100,
subsample=1.0,random_state=42).fit(x_train,y_train)
y_pred = xgbm_cv.predict(x_test)
xgbm_score = (accuracy_score(y_test,y_pred)*100)
xgbm_score
xgbm_cm = confusion_matrix(y_test,y_pred)
xgbm_cm
# LightGBM baseline.
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier().fit(x_train,y_train)
y_pred = lgbm.predict(x_test)
lgbm_score = (accuracy_score(y_test,y_pred)*100)
lgbm_score
# Grid-search the main LightGBM knobs.
lgbm_params = {"learning_rate" : [0.001,0.01, 0.1],
"n_estimators": [100,200,300,500,1000],
"max_depth": [2,3,5,7],
"min_child_samples": [1,3,5,7]}
lgbm = LGBMClassifier()
lgbm_cv = GridSearchCV(lgbm,lgbm_params,verbose=0,n_jobs=-1,cv=5)
lgbm_cv_model = lgbm_cv.fit(x_train,y_train)
lgbm_cv_model.best_params_
# Refit with hand-picked parameters.
# NOTE(review): n_estimators=400 was never in the searched grid -- confirm
# this choice against the cross-validation results.
lgbm = LGBMClassifier(learning_rate=0.01,
max_depth=5,
min_child_samples=5,
n_estimators=400)
lgbm_tuned = lgbm.fit(x_train,y_train)
y_pred = lgbm_tuned.predict(x_test)
lgbm_tuned_acc = (accuracy_score(y_test,y_pred)*100)
lgbm_tuned_acc
lgbm_cm = confusion_matrix(y_test,y_pred)
lgbm_cm
# CatBoost with default parameters.
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier().fit(x_train,y_train)
# Fix: the original called predict twice on identical input; one call suffices.
y_pred = cat_model.predict(x_test)
catb_final_score =(accuracy_score(y_test,y_pred)*100)
catb_final_score
catb_cm = confusion_matrix(y_test,y_pred)
catb_cm
# Standardize the features (zero mean, unit variance) to see whether some
# models score better than with min-max normalization. Re-split with the same
# seed and ratio so the partitions stay comparable.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.30, random_state=42
)
# KNN on the standardized features: same grid as before, cv=5 this time.
knn_params = {"n_neighbors":np.arange(1,50),
"weights": ["uniform","distance"],
"metric":["euclidean","manhattan"]}
knn =KNeighborsClassifier()
knn_cv = GridSearchCV(knn,knn_params,cv = 5)
knn_cv = knn_cv.fit(x_train,y_train)
print("Best Parameters:"+str(knn_cv.best_params_))
# Refit with parameters hard-coded from a previous search run.
knn_scaled =KNeighborsClassifier(n_neighbors =29,metric='manhattan',weights='distance')
knn_scaled = knn_scaled.fit(x_train,y_train)
y_pred = knn_scaled.predict(x_test)
knn_sscore = (accuracy_score(y_test,y_pred)*100)
knn_sscore
knn_scaled_conf = confusion_matrix(y_test,y_pred)
knn_scaled_conf
# SVM kernels on the standardized features.
svm_scaled_linear = SVC(kernel='linear').fit(x_train,y_train)
svm_scaled_poly = SVC(kernel='poly').fit(x_train,y_train)
svm_scaled_rbf = SVC(kernel='rbf').fit(x_train,y_train)
y_pred_slinear = svm_scaled_linear.predict(x_test)
y_pred_spoly = svm_scaled_poly.predict(x_test)
y_pred_srbf = svm_scaled_rbf.predict(x_test)
# NOTE(review): these accuracies print as fractions (no *100), unlike the
# earlier kernel comparison -- mind the scale when comparing.
print(accuracy_score(y_test,y_pred_slinear))
print(accuracy_score(y_test,y_pred_spoly))
print(accuracy_score(y_test,y_pred_srbf))
# Grid search over a wider C range for the scaled data.
svc_params = {"C": [10,50,100,500,700],
'kernel':['poly','rbf'],
"gamma": [0.001, 0.01, 0.1]}
svc = SVC()
svc_cv_model = GridSearchCV(svc,svc_params,
cv = 5,
n_jobs = -1,
verbose = 2)
svc_cv_model.fit(x_train,y_train)
print("Best Parameters:"+str(svc_cv_model.best_params_))
# Refit with parameters hard-coded from a previous search run.
svc_scaled = SVC(kernel = 'rbf',C = 700, gamma = 0.01)
scaled = svc_scaled.fit(x_train,y_train)
y_pred = scaled.predict(x_test)
svm_scaled_score = (accuracy_score(y_test,y_pred)*100)
svm_scaled_score
svc_scaled_conf = confusion_matrix(y_test,y_pred)
svc_scaled_conf
# Random forest on the standardized features (depth grid extended to 10).
rf_params = {'max_depth':list(range(1,11)),
"max_features":["log2","auto","sqrt"],
"n_estimators":[2,10,20,50,150,300],
'criterion' : ['gini','entropy'],
'min_samples_leaf' : [1,3,5,10]}
rf_model = RandomForestClassifier(random_state = 42)
rf_cv_model = GridSearchCV(rf_model,
rf_params,
cv = 5,
n_jobs = -1)
rf_cv_model.fit(x_train,y_train)
rf_cv_model.best_params_
# Refit with hand-picked parameters.
rf_tuned = RandomForestClassifier(max_depth = 10,
criterion = 'gini',
max_features = 'log2',
min_samples_leaf = 1,
n_estimators = 150,random_state = 42)
rf_tuned = rf_tuned.fit(x_train,y_train)
y_pred = rf_tuned.predict(x_test)
rf_scaled_score = (accuracy_score(y_test,y_pred)*100)
rf_scaled_score
rf_scaled_conf = confusion_matrix(y_test,y_pred)
rf_scaled_conf
# LightGBM on the standardized features.
lgbm_params = {"learning_rate" : [0.01, 0.02,0.1],
"n_estimators": [100,200,300,500,1000],
"max_depth": [2,3,5,7],
"min_child_samples": [1,2,5,10]}
lgbm = LGBMClassifier()
lgbm_cv = GridSearchCV(lgbm,lgbm_params,verbose=0,n_jobs=-1,cv=5)
lgbm_cv_model = lgbm_cv.fit(x_train,y_train)
lgbm_cv_model.best_params_
# Refit with hand-picked parameters.
lgbm = LGBMClassifier(learning_rate=0.02,
max_depth=5,min_child_samples=5,
n_estimators=500,random_state = 42)
lgbm_tuned = lgbm.fit(x_train,y_train)
y_pred = lgbm_tuned.predict(x_test)
lgbm_scaled_acc = (accuracy_score(y_test,y_pred)*100)
lgbm_scaled_acc
lgbm_scaled_conf = confusion_matrix(y_test,y_pred)
lgbm_scaled_conf
# ROC curve for the tuned logistic regression.
# NOTE(review): log_reg_tuned was fitted on the min-max-normalized split, but
# x_test was re-created above from the STANDARDIZED features -- these
# predictions mix the two preprocessings; confirm this is intended.
logit_roc_auc = roc_auc_score(y_test,log_reg_tuned.predict(x_test))
fpr, tpr, tresholds = roc_curve(y_test,log_reg_tuned.predict_proba(x_test)[:,1])
plt.figure(figsize=(6,6))
plt.plot(fpr,tpr,label = "AUC (area = %0.2f)"%logit_roc_auc)
# Diagonal reference line = random classifier.
plt.plot([0,1],[0,1],"r--")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel("False Positive Ratio")
plt.ylabel("True Positive Ratio")
plt.title('ROC Curve');
# AUC from the probability scores kept from the logistic-regression cell.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_probs)
# Draw every model's confusion matrix as one 4x4 grid of heatmaps.
fig = plt.figure(figsize=(15,15))
panels = [
    ('Logistic Regression Classification', lr_cm),
    ('KNN Classification', knn_cm),
    ('SVM Classification', svm_cm),
    ('Naive Bayes Classification', nb_cm),
    ('Random Forest Classification', rf_cm),
    ('GBM Classification', gbm_cm),
    ('LightGBM Classification', lgbm_cm),
    ('XGBoost Classification', xgbm_cm),
    ('CatBoost Classification', catb_cm),
    ('KNN Scaled Classification', knn_scaled_conf),
    ('SVC Scaled Classification', svc_scaled_conf),
    ('Random Forest Scaled Classification', rf_scaled_conf),
    ('LightGBM Scaled Classification', lgbm_scaled_conf),
]
for position, (title, cm) in enumerate(panels, start=1):
    axis = fig.add_subplot(4, 4, position)  # row, column, position
    axis.set_title(title)
    sns.heatmap(data=cm, annot=True, linewidth=0.7, linecolor='cyan', fmt='.0f', ax=axis, cmap='magma')
plt.show()
# Bar chart comparing the final (tuned) accuracy of every model.
indexx = ["Log","RF","KNN","SVM","NB","GBM","LightGBM","XGBoost",'CatBoost',"KNN Scaled","SVM Scaled", 'RF Scaled',"LightGBM Scaled"]
regressions = [log_tuned_score,rf_tuned_score,knn_tuned,svc_rbf_score,nb_tuned,gbm_tuned_score,
lgbm_tuned_acc,xgbm_score,catb_final_score,knn_sscore,svm_scaled_score,rf_scaled_score,lgbm_scaled_acc]
plt.figure(figsize=(12,8))
sns.barplot(x=indexx,y=regressions)
plt.xticks(rotation=45)
plt.title('Model Comparision',color = 'green',fontsize=20);
# Donut (pie) chart of the same scores, rendered with plotly.
pie_list=regressions
# NOTE(review): each label is a (name, score) tuple produced by zip --
# presumably intended to show both on the chart.
labels=list(zip(indexx,regressions))
fig={
"data":[
{
"values":pie_list,
"labels":labels,
"domain": {"x": [.2, 1]},
"name": "Models-Accuracy Score",
"hoverinfo":"label+percent+name",
"hole": .4,
"type": "pie"
},],
"layout":{
"title":"Accuracy Scores",
"annotations":[
{
"font":{"size":20},
"showarrow": False,
"text": "Model Scores",
"x": 0.60,
"y": 0.50
},
]
}
}
iplot(fig)
Conclusion
The LightGBM model has the highest accuracy rate (87.1).
The CatBoost model is really successful at catching 'true positives' of the confusion matrix => (2356, 73) (307, 264)
The GBM model is successful at catching 'false positives' of the confusion matrix => (2339, 90) (297, 274)
Model tuning made quite an improvement in all models.
We observed different scores for some models with normalization and standard scaling.
The final model depends on whether we aim at catching exited members or all members (true-positive/false-positive).